package storm.cookbook.tfidf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import backtype.storm.Config; import backtype.storm.LocalCluster; import backtype.storm.LocalDRPC; import backtype.storm.StormSubmitter; import backtype.storm.generated.StormTopology; import backtype.storm.topology.TopologyBuilder; import backtype.storm.tuple.Fields; import backtype.storm.tuple.Values; import storm.cookbook.tfidf.bolt.PublishURLBolt; import storm.cookbook.tfidf.functions.BatchCombiner; import storm.cookbook.tfidf.functions.DocumentFetchFunction; import storm.cookbook.tfidf.functions.DocumentTokenizer; import storm.cookbook.tfidf.functions.PersistDocumentFunction; import storm.cookbook.tfidf.functions.SplitAndProjectToFields; import storm.cookbook.tfidf.functions.TermFilter; import storm.cookbook.tfidf.functions.TfidfExpression; import storm.cookbook.tfidf.spout.TwitterSpout; import storm.cookbook.tfidf.state.TimeBasedRowStrategy; import storm.trident.Stream; import storm.trident.TridentState; import storm.trident.TridentTopology; import storm.trident.operation.BaseFunction; import storm.trident.operation.TridentCollector; import storm.trident.operation.builtin.Count; import storm.trident.operation.builtin.FilterNull; import storm.trident.operation.builtin.MapGet; import storm.trident.spout.ITridentSpout; import storm.trident.state.StateFactory; import storm.trident.testing.FixedBatchSpout; import storm.trident.testing.Split; import storm.trident.tuple.TridentTuple; import trident.cassandra.CassandraState; public class TermTopology { static Logger LOG = LoggerFactory.getLogger(TermTopology.class); private static String[] searchTerms = new String[] { "AAPL", "Mac", "iPhone", "iStore", "Apple" }; private static String[] mimeTypes = new String[] { "application/pdf", "text/html", "text/plain" }; private static StateFactory getStateFactory(String rowKey) { CassandraBucketState.BucketOptions options = new CassandraBucketState.BucketOptions(); options.keyspace = "storm"; options.columnFamily = "tfidf"; options.rowKey = rowKey; options.keyStrategy = new TimeBasedRowStrategy(); return CassandraBucketState.nonTransactional("localhost", options); } private static StateFactory getBatchStateFactory(String rowKey) { CassandraState.Options options = new CassandraState.Options(); options.keyspace = "storm"; options.columnFamily = "tfidfbatch"; options.rowKey = rowKey; return CassandraState.nonTransactional("localhost", options); } public static StormTopology getTwitterTopology() { TopologyBuilder builder = new TopologyBuilder(); builder.setSpout("twitterSpout", new TwitterSpout(searchTerms, 1000), 1); builder.setBolt("publishBolt", new PublishURLBolt(), 2).shuffleGrouping("twitterSpout"); return builder.createTopology(); } public static class PrintlnFunction extends BaseFunction { // @Override public void execute(TridentTuple tuple, TridentCollector collector) { System.out.println("New Tuple for printing: " + tuple.toString()); collector.emit(new Values("dummy")); } } public static class StaticSourceFunction extends BaseFunction { private String source; public StaticSourceFunction(String source) { this.source = source; } // @Override public void execute(TridentTuple tuple, TridentCollector collector) { LOG.debug("Emitting static value"); collector.emit(new Values(source)); } } public static Stream getUrlStream(TridentTopology topology, ITridentSpout spout) { Stream urlStream = null; if (spout == null) { FixedBatchSpout testSpout = new FixedBatchSpout(new Fields("url"), 1, new Values("doc01"), new Values("doc02"), new Values("doc03"), new Values("doc04"), new Values("doc05")); testSpout.setCycle(true); urlStream = topology.newStream("spout1", testSpout); } else { urlStream = topology.newStream("spout1", spout); } return urlStream.parallelismHint(16); } public static void addDQueryStream(TridentState state, TridentTopology topology, LocalDRPC drpc) { topology.newDRPCStream("dQuery", drpc).each(new Fields("args"), new Split(), new Fields("source")) .stateQuery(state, new Fields("source"), new MapGet(), new Fields("d")) .each(new Fields("d"), new FilterNull()).project(new Fields("source", "d")); } private static void addDFQueryStream(TridentState dfState, TridentTopology topology, LocalDRPC drpc) { topology.newDRPCStream("dfQuery", drpc).each(new Fields("args"), new Split(), new Fields("term")) .stateQuery(dfState, new Fields("term"), new MapGet(), new Fields("df")) .each(new Fields("df"), new FilterNull()).project(new Fields("term", "df")); } private static void addTFIDFQueryStream(TridentState tfState, TridentState dfState, TridentState dState, TridentTopology topology, LocalDRPC drpc) { TridentState batchDfState = topology.newStaticState(getBatchStateFactory("df")); TridentState batchDState = topology.newStaticState(getBatchStateFactory("d")); TridentState batchTfState = topology.newStaticState(getBatchStateFactory("tf")); topology.newDRPCStream("tfidfQuery", drpc) .each(new Fields("args"), new SplitAndProjectToFields(), new Fields("documentId", "term")) .each(new Fields(), new StaticSourceFunction("twitter"), new Fields("source")) .stateQuery(tfState, new Fields("documentId", "term"), new MapGet(), new Fields("tf_rt")) .stateQuery(dfState, new Fields("term"), new MapGet(), new Fields("df_rt")) .stateQuery(dState, new Fields("source"), new MapGet(), new Fields("d_rt")) .stateQuery(batchTfState, new Fields("documentId", "term"), new MapGet(), new Fields("tf_batch")) .stateQuery(batchDfState, new Fields("term"), new MapGet(), new Fields("df_batch")) .stateQuery(batchDState, new Fields("source"), new MapGet(), new Fields("d_batch")) .each(new Fields("tf_rt", "df_rt", "d_rt", "tf_batch", "df_batch", "d_batch"), new BatchCombiner(), new Fields("tf", "d", "df")) .each(new Fields("term", "documentId", "tf", "d", "df"), new TfidfExpression(), new Fields("tfidf")) .each(new Fields("tfidf"), new FilterNull()).project(new Fields("documentId", "term", "tfidf")); } @SuppressWarnings("rawtypes") public static TridentTopology buildTopology(ITridentSpout spout, LocalDRPC drpc) { TridentTopology topology = new TridentTopology(); Stream documentStream = getUrlStream(topology, spout).each(new Fields("url"), new DocumentFetchFunction(mimeTypes), new Fields("document", "documentId", "source")); documentStream.each(new Fields("documentId", "document"), new PersistDocumentFunction(), new Fields()); Stream termStream = documentStream.parallelismHint(20) .each(new Fields("document"), new DocumentTokenizer(), new Fields("dirtyTerm")) .each(new Fields("dirtyTerm"), new TermFilter(), new Fields("term")) .project(new Fields("term", "documentId", "source")); TridentState dfState = termStream.groupBy(new Fields("term")).persistentAggregate(getStateFactory("df"), new Count(), new Fields("df")); addDFQueryStream(dfState, topology, drpc); TridentState dState = documentStream.groupBy(new Fields("source")).persistentAggregate(getStateFactory("d"), new Count(), new Fields("d")); addDQueryStream(dState, topology, drpc); TridentState tfState = termStream.groupBy(new Fields("documentId", "term")) .persistentAggregate(getStateFactory("tf"), new Count(), new Fields("tf")); addTFIDFQueryStream(tfState, dfState, dState, topology, drpc); return topology; } public static void main(String[] args) throws Exception { Config conf = new Config(); conf.setMaxSpoutPending(20); conf.put(Conf.REDIS_HOST_KEY, "localhost"); conf.put(Conf.REDIS_PORT_KEY, Conf.DEFAULT_JEDIS_PORT); conf.put("DOCUMENT_PATH", "document.avro"); if (args.length == 0) { LocalDRPC drpc = new LocalDRPC(); LocalCluster cluster = new LocalCluster(); conf.setDebug(true); TridentTopology topology = buildTopology(null, drpc); cluster.submitTopology("tfidf", conf, topology.build()); for (int i = 0; i < 100; i++) { System.out.println("About to query!"); System.out.println("DRPC RESULT: " + drpc.execute("tfidfQuery", "doc01 area")); Thread.sleep(1000); } } else { conf.setNumWorkers(6); StormSubmitter.submitTopology("twitter", conf, getTwitterTopology()); // TODO: Create the twitter spout and pass it in here... StormSubmitter.submitTopology(args[0], conf, buildTopology(null, null).build()); } } }